import data

bcdata = read_csv("bcdata_Assignment1.csv") |> janitor::clean_names()

Q1 get quantitative fesures

data_type = str(bcdata) # all variables are numeric.
## spc_tbl_ [116 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age           : num [1:116] 48 83 82 68 86 49 89 76 73 75 ...
##  $ bmi           : num [1:116] 23.5 20.7 23.1 21.4 21.1 ...
##  $ glucose       : num [1:116] 70 92 91 77 92 92 77 118 97 83 ...
##  $ insulin       : num [1:116] 2.71 3.12 4.5 3.23 3.55 ...
##  $ homa          : num [1:116] 0.467 0.707 1.01 0.613 0.805 ...
##  $ leptin        : num [1:116] 8.81 8.84 17.94 9.88 6.7 ...
##  $ adiponectin   : num [1:116] 9.7 5.43 22.43 7.17 4.82 ...
##  $ resistin      : num [1:116] 8 4.06 9.28 12.77 10.58 ...
##  $ mcp_1         : num [1:116] 417 469 555 928 774 ...
##  $ classification: num [1:116] 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   BMI = col_double(),
##   ..   Glucose = col_double(),
##   ..   Insulin = col_double(),
##   ..   HOMA = col_double(),
##   ..   Leptin = col_double(),
##   ..   Adiponectin = col_double(),
##   ..   Resistin = col_double(),
##   ..   MCP.1 = col_double(),
##   ..   Classification = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary = skimr::skim(bcdata)# There's no missing in this dataset.
summary
Data summary
Name bcdata
Number of rows 116
Number of columns 10
_______________________
Column type frequency:
numeric 10
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
age 0 1 57.30 16.11 24.00 45.00 56.00 71.00 89.00 ▃▇▅▇▃
bmi 0 1 27.58 5.02 18.37 22.97 27.66 31.24 38.58 ▅▆▇▅▃
glucose 0 1 97.79 22.53 60.00 85.75 92.00 102.00 201.00 ▅▇▁▁▁
insulin 0 1 10.01 10.07 2.43 4.36 5.92 11.19 58.46 ▇▁▁▁▁
homa 0 1 2.69 3.64 0.47 0.92 1.38 2.86 25.05 ▇▁▁▁▁
leptin 0 1 26.62 19.18 4.31 12.31 20.27 37.38 90.28 ▇▃▂▁▁
adiponectin 0 1 10.18 6.84 1.66 5.47 8.35 11.82 38.04 ▇▅▂▁▁
resistin 0 1 14.73 12.39 3.21 6.88 10.83 17.76 82.10 ▇▂▁▁▁
mcp_1 0 1 534.65 345.91 45.84 269.98 471.32 700.08 1698.44 ▇▇▃▁▁
classification 0 1 1.55 0.50 1.00 1.00 2.00 2.00 2.00 ▆▁▁▁▇

Q2 catogorize bmi based on WHO criteria

q2 = bcdata |> mutate(who_bmi =
                   ifelse(bmi < 16.5, "Severely underweight",
                          ifelse(16.5 <= bmi & bmi < 18.5, "Underweight",
                                 ifelse(18.5 <= bmi & bmi < 25, "Normal weight",
                                        ifelse(25 <= bmi & bmi < 30, "Overweight", 
                                               ifelse(30 <= bmi & bmi < 35, "Obesity class I",
                                                      ifelse(35 <= bmi & bmi < 40, "Obesity class II","Obesity class III"))))))) |> 
  mutate(classification = recode(classification, "1" = "Healthy Controls", "2" = "Breast Cancer Patients")) |> arrange(bmi)



#check if I have recoded bmi correctly
table(q2$bmi, q2$who_bmi)

Q3 draw bar plot

#Since there's no healthy controls in underweight category, I added a category "underweight healthhy controls" with 0 count to make sure every column has the same width.
freq_table = q2 |> group_by(who_bmi, classification) |>
  summarise(n = n()) |> mutate(proportion = n / sum(n) * 100)

supp = data.frame(who_bmi = "Underweight", 
              classification = "Healthy Controls",
              n = 0, proportion = 0)
final_freq = bind_rows(supp, freq_table) |> mutate(who_bmi = as.factor(who_bmi)) 

level = c("Severely underweight", "Underweight", "Normal weight", "Overweight", "Obesity class I", "Obesity class II", "Obesity class III")
final_freq |> 
  mutate(who_bmi = forcats::fct_relevel(who_bmi, level),
         text_label = str_c(proportion, "%")) |>
  plot_ly(x = ~who_bmi, y = ~proportion, color = ~classification, type = "bar", colors = "viridis", text = ~text_label)

But honestly, I think a barchart showing porportion of the whole sample within each category.(each category doesn’t accounted for 1) is more informative.

Q4 regression model

q4 = bcdata |> 
  mutate(classification = recode(classification, "1" = 0, "2" = 1))
# recode 1-healthy controls as "0", 2-breast cancer patients as "1"
table(q4$classification, bcdata$classification)
##    
##      1  2
##   0 52  0
##   1  0 64
# recoded correctly

mylogit = glm(classification ~ glucose + homa + leptin + bmi + age, data = q4, family = "binomial")
mylogit |> broom::tidy()
## # A tibble: 6 × 5
##   term        estimate std.error statistic  p.value
##   <chr>          <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept) -3.63       2.36      -1.54  0.124   
## 2 glucose      0.0817     0.0235     3.47  0.000515
## 3 homa         0.274      0.172      1.59  0.111   
## 4 leptin      -0.00857    0.0158    -0.543 0.587   
## 5 bmi         -0.104      0.0566    -1.84  0.0657  
## 6 age         -0.0229     0.0144    -1.59  0.111
cbind(estimate = coef(mylogit), confint(mylogit))
## Waiting for profiling to be done...
##                 estimate       2.5 %      97.5 %
## (Intercept) -3.626064816 -8.54138756 0.754487774
## glucose      0.081698730  0.03956613 0.132397841
## homa         0.273882199 -0.02555240 0.653222623
## leptin      -0.008573804 -0.04019445 0.022416142
## bmi         -0.104260515 -0.21944692 0.004398024
## age         -0.022880956 -0.05192184 0.004856327
exp(cbind(OR = coef(mylogit), confint(mylogit)))
## Waiting for profiling to be done...
##                     OR        2.5 %   97.5 %
## (Intercept) 0.02662074 0.0001952192 2.126522
## glucose     1.08512884 1.0403592957 1.141562
## homa        1.31505988 0.9747713042 1.921724
## leptin      0.99146285 0.9606026348 1.022669
## bmi         0.90099055 0.8029627758 1.004408
## age         0.97737883 0.9494030652 1.004868

Q5 linear regression

q5 = bcdata
fit = lm(insulin ~ bmi + age + glucose, data = q5)
fit |> broom::tidy()
## # A tibble: 4 × 5
##   term        estimate std.error statistic      p.value
##   <chr>          <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept) -13.5       5.86      -2.30  0.0231      
## 2 bmi           0.150     0.164      0.914 0.363       
## 3 age          -0.0540    0.0519    -1.04  0.301       
## 4 glucose       0.230     0.0375     6.13  0.0000000137
cbind(estimate = coef(fit), confint(fit))
##                 estimate       2.5 %      97.5 %
## (Intercept) -13.49575923 -25.1054353 -1.88608318
## bmi           0.14969033  -0.1748942  0.47427491
## age          -0.05402168  -0.1569321  0.04888876
## glucose       0.22981790   0.1554864  0.30414939